{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.生成词向量\n",
    "\n",
    "下载GitHub项目：[https://github.com/stanfordnlp/GloVe/archive/master.zip](https://github.com/stanfordnlp/GloVe/archive/master.zip)\n",
    "\n",
    "解压后，进入目录执行\n",
    "\n",
    "make\n",
    "\n",
    "进行编译操作。\n",
    "\n",
    "然后执行 sh demo.sh 进行训练并生成词向量文件：vectors.txt和vectors.bin\n",
    "\n",
    "## 2. 词向量生成模型并加载"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import shutil\n",
    "import gensim"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "71291\n",
      "71291\n",
      "to --\n",
      "would 0.88327956199646\n",
      "them 0.8333861231803894\n",
      "they 0.8308783173561096\n",
      "but 0.8260759711265564\n",
      "will 0.824319064617157\n",
      "make 0.8223329782485962\n",
      "not 0.8217222690582275\n",
      "could 0.819805920124054\n",
      "it 0.8186241984367371\n",
      "this 0.8162687420845032\n",
      "\n",
      "one --\n",
      "seven 0.974886417388916\n",
      "eight 0.9729028344154358\n",
      "six 0.9693377017974854\n",
      "four 0.9586309790611267\n",
      "nine 0.9584434032440186\n",
      "five 0.9540387392044067\n",
      "three 0.9511278867721558\n",
      "two 0.9247976541519165\n",
      "zero 0.8960137963294983\n",
      "in 0.7726520299911499\n",
      "\n"
     ]
    }
   ],
   "source": [
    "def getFileLineNums(filename):  \n",
    "    f = open(filename,'r')  \n",
    "    count = 0  \n",
    "  \n",
    "    for line in f:  \n",
    "          \n",
    "        count += 1  \n",
    "    return count\n",
    " \n",
    "def prepend_line(infile, outfile, line):  \n",
    "    \"\"\" \n",
    "    Function use to prepend lines using bash utilities in Linux. \n",
    "    (source: http://stackoverflow.com/a/10850588/610569) \n",
    "    \"\"\"  \n",
    "    with open(infile, 'r') as old:  \n",
    "        with open(outfile, 'w') as new:  \n",
    "            new.write(str(line) + \"\\n\")  \n",
    "            shutil.copyfileobj(old, new)\n",
    "            \n",
    "def prepend_slow(infile, outfile, line):  \n",
    "    \"\"\" \n",
    "    Slower way to prepend the line by re-creating the inputfile. \n",
    "    \"\"\"  \n",
    "    with open(infile, 'r') as fin:  \n",
    "        with open(outfile, 'w') as fout:  \n",
    "            fout.write(line + \"\\n\")  \n",
    "            for line in fin:  \n",
    "                fout.write(line) \n",
    "                \n",
    "def load(filename):  \n",
    "      \n",
    "    # Input: GloVe Model File  \n",
    "    # More models can be downloaded from http://nlp.stanford.edu/projects/glove/  \n",
    "    # glove_file=\"glove.840B.300d.txt\"  \n",
    "    glove_file = filename  \n",
    "      \n",
    "    dimensions = 50  \n",
    "      \n",
    "    num_lines = getFileLineNums(filename)  \n",
    "    # num_lines = check_num_lines_in_glove(glove_file)  \n",
    "    # dims = int(dimensions[:-1])  \n",
    "    dims = 50  \n",
    "      \n",
    "    print(num_lines)  \n",
    "        #  \n",
    "        # # Output: Gensim Model text format.  \n",
    "    gensim_file='glove_model.txt'  \n",
    "    gensim_first_line = \"{} {}\".format(num_lines, dims)  \n",
    "        #  \n",
    "        # # Prepends the line.  \n",
    "    #if platform == \"linux\" or platform == \"linux2\":  \n",
    "    prepend_line(glove_file, gensim_file, gensim_first_line)  \n",
    "    #else:  \n",
    "    #    prepend_slow(glove_file, gensim_file, gensim_first_line)  \n",
    "      \n",
    "        # Demo: Loads the newly created glove_model.txt into gensim API.  \n",
    "    model=gensim.models.KeyedVectors.load_word2vec_format(gensim_file,binary=False) #GloVe Model  \n",
    "      \n",
    "    model_name = gensim_file[6:-4]  \n",
    "          \n",
    "    model.save(model_name)  \n",
    "      \n",
    "    return model\n",
    "\n",
    "if __name__ == '__main__':  \n",
    "    myfile='GloVe-master/vectors.txt'\n",
    "    model = load(myfile)\n",
    "    \n",
    "    #############   模型加载  #######################\n",
    "    # model_name='glove_model.txt'\n",
    "    # model = gensim.models.KeyedVectors.load(model_name)  \n",
    " \n",
    "    print(len(model.vocab)) \n",
    " \n",
    "    word_list = [u'to',u'one']  \n",
    "   \n",
    "    for word in word_list:  \n",
    "        print(word,'--')\n",
    "        for i in model.most_similar(word, topn=10):  \n",
    "            print(i[0],i[1])\n",
    "        print('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
